# Preview the first four rides to get a feel for the available columns.
head(bike, 4L)
## ride_id rideable_type started_at ended_at
## 1 726C3A99FFCAE10C classic_bike 2022-08-18 18:08:30 2022-08-18 19:00:37
## 2 F1AC3AED5E7498FB classic_bike 2022-08-11 18:28:21 2022-08-11 18:44:35
## 3 9C93876268A75FD7 classic_bike 2022-08-28 19:40:43 2022-08-28 20:52:50
## 4 45AFFC2B7A7BD7C9 classic_bike 2022-08-15 20:21:00 2022-08-15 20:44:37
## start_station_name start_station_id
## 1 Grandview Library at Oakland Ave 79
## 2 Jaeger St & Whittier St 59
## 3 High St & Crestview Rd 88
## 4 High St & Crestview Rd 88
## end_station_name end_station_id start_lat start_lng end_lat
## 1 Grandview Library at Oakland Ave 79 39.98193 -83.04898 39.98193
## 2 Jaeger St & Whittier St 59 39.94460 -82.98950 39.94460
## 3 High St & Crestview Rd 88 40.02252 -83.01364 40.02252
## 4 High St & Crestview Rd 88 40.02252 -83.01364 40.02252
## end_lng member_casual
## 1 -83.04898 member
## 2 -82.98950 member
## 3 -83.01364 member
## 4 -83.01364 casual
# Total number of missing cells: per-column NA counts added together.
print(paste(sum(colSums(is.na(bike))), "number of NA in the data"))
## [1] "2131 number of NA in the data"
# Number of missing values in each column, as a named integer vector.
vapply(bike, function(column) sum(is.na(column)), integer(1))
## ride_id rideable_type started_at ended_at
## 0 0 0 0
## start_station_name start_station_id end_station_name end_station_id
## 0 870 0 1231
## start_lat start_lng end_lat end_lng
## 0 0 15 15
## member_casual
## 0
Most of the NA values are in the columns start_station_id and
end_station_id. However, since the name of the station is
present, I don’t think we should delete the whole row!
# Treat the two categorical columns (rideable_type, member_casual) as factors.
# NOTE: rideable_type ends up with three levels; this needs a closer look.
bike[c("rideable_type", "member_casual")] <-
  lapply(bike[c("rideable_type", "member_casual")], factor)
# Split the started_at / ended_at timestamps into a clock-time column
# (character, "HH:MM:SS") and a calendar-date column (Date class).
bike[["start_time"]] <- strftime(as.POSIXct(bike[["started_at"]]), "%H:%M:%S")
bike[["end_time"]] <- strftime(as.POSIXct(bike[["ended_at"]]), "%H:%M:%S")
bike[["start_date"]] <- as.Date(bike[["started_at"]])
bike[["end_date"]] <- as.Date(bike[["ended_at"]])
# Re-inspect the column types now that the factor, time and date columns exist.
utils::str(bike)
## 'data.frame': 7416 obs. of 17 variables:
## $ ride_id : chr "726C3A99FFCAE10C" "F1AC3AED5E7498FB" "9C93876268A75FD7" "45AFFC2B7A7BD7C9" ...
## $ rideable_type : Factor w/ 3 levels "classic_bike",..: 1 1 1 1 3 3 1 1 1 1 ...
## $ started_at : chr "2022-08-18 18:08:30" "2022-08-11 18:28:21" "2022-08-28 19:40:43" "2022-08-15 20:21:00" ...
## $ ended_at : chr "2022-08-18 19:00:37" "2022-08-11 18:44:35" "2022-08-28 20:52:50" "2022-08-15 20:44:37" ...
## $ start_station_name: chr "Grandview Library at Oakland Ave" "Jaeger St & Whittier St" "High St & Crestview Rd" "High St & Crestview Rd" ...
## $ start_station_id : num 79 59 88 88 88 88 55 55 88 54 ...
## $ end_station_name : chr "Grandview Library at Oakland Ave" "Jaeger St & Whittier St" "High St & Crestview Rd" "High St & Crestview Rd" ...
## $ end_station_id : num 79 59 88 88 88 88 48 62 110 54 ...
## $ start_lat : num 40 39.9 40 40 40 ...
## $ start_lng : num -83 -83 -83 -83 -83 ...
## $ end_lat : num 40 39.9 40 40 40 ...
## $ end_lng : num -83 -83 -83 -83 -83 ...
## $ member_casual : Factor w/ 2 levels "casual","member": 2 2 2 1 1 1 1 2 2 1 ...
## $ start_time : chr "18:08:30" "18:28:21" "19:40:43" "20:21:00" ...
## $ end_time : chr "19:00:37" "18:44:35" "20:52:50" "20:44:37" ...
## $ start_date : Date, format: "2022-08-18" "2022-08-11" ...
## $ end_date : Date, format: "2022-08-18" "2022-08-11" ...
# Number of rides per user type (casual vs. member).
count(bike, member_casual)
## member_casual n
## 1 casual 4069
## 2 member 3347
# Bar chart of the number of rides per membership type (casual vs. member).
# The fill legend is hidden because it would only repeat the x axis.
# Fixes the misspelled title and axis label ("membersip", "memebership").
ggplot(bike, aes(member_casual, fill = member_casual)) +
  geom_bar() +
  scale_fill_brewer(palette = "BuPu") +
  guides(fill = "none") +
  labs(title = "User membership types", x = "Types of membership") +
  theme_classic()
There are more casual users (24-hour pass or 3-day pass
users) than annual members, by around 700
users, in August 2022. Also, casual users have two pricing
options: a single trip costs
$2.25 per 30 min, and a day pass costs $8 for unlimited 30-min rides in a day;
an annual membership, on the other hand, costs $85 a year.
# Bar chart of the number of rides per bike type.
# The fill legend is hidden because it would only repeat the x axis.
# Fixes the misspelled title ("biks").
ggplot(bike, aes(rideable_type, fill = rideable_type)) +
  geom_bar() +
  scale_fill_brewer(palette = "BuPu") +
  guides(fill = "none") +
  labs(title = "Types of used bikes", x = "") +
  theme_classic()
There are few users of the docked_bike type compared to the
others. A docked bike is a bicycle that can be borrowed or
rented from an automated station, or “docking station”. It is
interesting that people prefer the other types over this type!
Therefore, we recommend that the company not invest in this type.
# Ride counts for every (user type, bike type) pair. docked_bike is left out
# because it accounts for only 3 users.
members_preferance <- bike %>%
  filter(rideable_type != "docked_bike") %>%
  group_by(member_casual, rideable_type) %>%
  summarise(used = n())
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
# Show the per-user-type / per-bike-type ride counts.
members_preferance %>% print()
## # A tibble: 4 × 3
## # Groups: member_casual [2]
## member_casual rideable_type used
## <fct> <fct> <int>
## 1 casual classic_bike 1689
## 2 casual electric_bike 2377
## 3 member classic_bike 1712
## 4 member electric_bike 1635
# Side-by-side bars: rides per bike type within each user type.
# geom_col() draws the precomputed `used` counts as bar heights.
ggplot(
  members_preferance,
  aes(x = member_casual, y = used, fill = rideable_type)
) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "BuPu") +
  labs(title = "Most used bike type to user", x = "type of user", y = "") +
  theme_classic()
While there is no huge difference among annual members in
choosing classic or electric bikes,
casual users choose electric bikes over
classic ones by around 690 users.
# Probability of each user type picking each bike type: every row of the
# contingency table is divided by its row total (margin = 1), so each row
# sums to 1. Without prop.table() the table holds raw counts and round()
# has no effect.
round(prop.table(table(bike$member_casual, bike$rideable_type), margin = 1), 2)
##
## classic_bike docked_bike electric_bike
## casual 1689 3 2377
## member 1712 0 1635
While annual members choose electric and classic bikes
in almost equal numbers, casual
users are more likely to choose an electric bike.
# Day of the week on which each ride started, stored as a factor ordered
# Saturday..Friday for plotting.
# The weekday comes from the numeric `wday` field (0 = Sunday ... 6 = Saturday)
# rather than format(x, "%a"): the latter returns locale-dependent names, which
# would all become NA against the English levels in a non-English session.
bike$days <- factor(
  as.POSIXlt(bike$start_date)$wday,
  levels = c(6, 0:5),
  labels = c("Sat", "Sun", "Mon", "Tue", "Wed", "Thu", "Fri")
)
# Bar chart of the number of rides per weekday; the fill legend is hidden
# because it would only repeat the x axis.
ggplot(bike, aes(days, fill = days)) +
  geom_bar() +
  scale_fill_brewer(palette = "BuPu") +
  guides(fill = "none") +
  labs(
    title = "Number of users in the days of the week",
    x = "Days of the week"
  ) +
  theme_classic()
Saturdays and Wednesdays have the highest number of users, but overall there is no big difference between the days of the week in the count of users.
# Hour of the day in which each ride started, taken from started_at.
bike$hour <- hour(bike$started_at)
# Number of rides per starting hour.
sum_hour <- bike %>%
  group_by(hour) %>%
  summarise(sum_hour = n())
# Line-and-point chart of rides per hour, with one axis tick for every hour.
ggplot(sum_hour, aes(x = hour, y = sum_hour)) +
  geom_line(color = "#8C6BB1", size = 1) +
  geom_point(color = "#8C96C6", size = 2) +
  scale_x_continuous(breaks = 0:23) +
  labs(title = "Use by hour", y = "") +
  theme_classic()
The peak hours in August are between 3:00 pm and 8:00 pm, in the range of 200 users.
# Number of rides for every (weekday, starting hour) combination.
# This replaces the earlier per-hour `sum_hour` summary.
sum_hour <- summarise(group_by(bike, days, hour), count = n())
## `summarise()` has grouped output by 'days'. You can override using the
## `.groups` argument.
# Rides per starting hour, drawn as one stacked panel per weekday.
# group = 1 makes geom_line() connect all the points inside a panel.
ggplot(sum_hour, aes(hour, count, color = days)) +
  geom_point() +
  geom_line(aes(group = 1)) +
  facet_grid(days ~ .) +
  scale_x_continuous(breaks = 0:23) +
  scale_y_continuous(breaks = seq(0, 130, by = 50)) +
  scale_color_manual(
    values = c(
      "#BFD3E6", "#9EBCDA", "#8C96C6", "#8C6BB1",
      "#88419D", "#810F7C", "#4D004B"
    )
  ) +
  labs(title = "Use by day and hour") +
  # Strip the plot background, grid lines and panel border.
  theme(
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  )
On weekends, the rush hours start at 9:00 am, while on weekdays they start earlier, at 6:00 am. Also, on most weekdays the line does not drop until 10:00 pm, but it drops a little earlier on weekends, at 9:00 pm.
library(mapview)
# Rides whose end coordinates are known (end_lat / end_lng contain a few NA).
# Columns are selected by name rather than by position so the subset keeps
# working if the column order changes.
end_station <- subset(bike, !is.na(end_lat) & !is.na(end_lng))
# Interactive map of the start stations.
# - xcol must be the longitude and ycol the latitude (they were swapped).
# - The coordinates are longitude/latitude in decimal degrees, so the CRS is
#   WGS84 (EPSG:4326).
# - `label` (previously misspelled `lable`) shows the station name on hover;
#   `layer.name` carries the "Start Station" title for the layer.
mapview(
  bike,
  xcol = "start_lng",
  ycol = "start_lat",
  crs = 4326,
  grid = FALSE,
  label = bike$start_station_name,
  layer.name = "Start Station"
)